OkCupid is a mobile dating app. It sets itself apart from other dating apps by making use of a precomputed compatibility score, calculated from optional questions the users may choose to answer.
In this dataset, there are roughly 60k records containing structured information such as age, sex, and orientation, as well as text data from open-ended profile descriptions.
Here is the link to the OkCupid dataset.
import pandas as pd
import plotly.express as px
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
# Display floats with thousands separators and two decimal places.
pd.options.display.float_format = '{:,.2f}'.format
# Load the profiles and work on a copy so the raw load stays untouched.
df_data_raw = pd.read_csv('okcupid_profiles.csv')
df_data = df_data_raw.copy()
df_data.shape
(59946, 31)
# Overview of the 31 columns (demographics, lifestyle, and ten essay fields).
df_data.columns
Index(['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks',
'drugs', 'education', 'ethnicity', 'height', 'income', 'job',
'last_online', 'location', 'offspring', 'pets', 'religion', 'sign',
'smokes', 'speaks', 'essay0', 'essay1', 'essay2', 'essay3', 'essay4',
'essay5', 'essay6', 'essay7', 'essay8', 'essay9'],
dtype='object')
# Peek at a few random profiles to get a feel for the data.
df_data.sample(3)
| age | status | sex | orientation | body_type | diet | drinks | drugs | education | ethnicity | ... | essay0 | essay1 | essay2 | essay3 | essay4 | essay5 | essay6 | essay7 | essay8 | essay9 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 13135 | 33 | single | f | straight | athletic | NaN | often | never | graduated from masters program | asian | ... | hello! i'm a transplant settling into the bay... | learning to live it abundantly. during the da... | peddling tim keller sermons and radiolab episodes | NaN | mere christianity, crime and punishment, flow,... | sisters, flip flops, apple, microbrew, faith i... | faith, understanding my heart & others', my fa... | happy houring, usually beer. out at dinner wit... | sometimes, just sometimes, i go to bed without... | you think we could be friends. =) |
| 39341 | 26 | single | m | straight | athletic | strictly anything | socially | NaN | graduated from college/university | white | ... | enjoying life back in the bay after spending a... | i have a job in finance in downtown sf. outsid... | not taking myself too seriously procrastinatin... | NaN | i have a serious appreciation for will ferrell... | 1. skis or a bike depending on the season 2. r... | the snow forecast for the sierra, tetons, and ... | out and about in sf with friends or cruising i... | i joined an online dating site ? | NaN |
| 27248 | 52 | single | m | straight | fit | NaN | socially | never | graduated from ph.d program | other | ... | i don't normally think in these terms; to be a... | living it | teaching when i am asked, helping when i can, ... | have to ask my friends what that is | metamorphosis and zen and the art of motorcycl... | air, sunshine, water, food, friends and family | i am actually trying to quiet down my mind so ... | sharing a beer with friends and open to sugges... | what? who? me! | you are real, solid and down to earth with a l... |
3 rows × 31 columns
# Check for fully duplicated profiles (none found).
df_data.duplicated().sum()
0
# Missing-value count per column — lifestyle fields and essays are sparse.
df_data.isna().sum()
age 0 status 0 sex 0 orientation 0 body_type 5296 diet 24395 drinks 2985 drugs 14080 education 6628 ethnicity 5680 height 3 income 0 job 8198 last_online 0 location 0 offspring 35561 pets 19921 religion 20226 sign 11056 smokes 5512 speaks 50 essay0 5488 essay1 7572 essay2 9638 essay3 11476 essay4 10537 essay5 10850 essay6 13771 essay7 12451 essay8 19225 essay9 12603 dtype: int64
# Hierarchical breakdown of users: sex -> orientation -> relationship status.
# update_traces returns the figure, so the two calls can be chained.
fig = px.sunburst(
    df_data,
    path=["sex", "orientation", "status"],
    title="OkCupid Users Sex, Orientation and Status",
).update_traces(
    textinfo="label+percent entry",
    insidetextorientation="horizontal",
)
fig.show()
# List every distinct location string (mostly SF Bay Area cities).
df_data.location.unique()
array(['south san francisco, california', 'oakland, california',
'san francisco, california', 'berkeley, california',
'belvedere tiburon, california', 'san mateo, california',
'daly city, california', 'san leandro, california',
'atherton, california', 'san rafael, california',
'walnut creek, california', 'menlo park, california',
'belmont, california', 'san jose, california',
'palo alto, california', 'emeryville, california',
'el granada, california', 'castro valley, california',
'fairfax, california', 'mountain view, california',
'burlingame, california', 'martinez, california',
'pleasant hill, california', 'hayward, california',
'alameda, california', 'vallejo, california',
'benicia, california', 'el cerrito, california',
'mill valley, california', 'richmond, california',
'redwood city, california', 'el sobrante, california',
'stanford, california', 'san pablo, california',
'novato, california', 'pacifica, california',
'lafayette, california', 'half moon bay, california',
'fremont, california', 'orinda, california',
'san anselmo, california', 'corte madera, california',
'albany, california', 'san carlos, california',
'san lorenzo, california', 'foster city, california',
'hercules, california', 'santa cruz, california',
'bolinas, california', 'sausalito, california',
'millbrae, california', 'larkspur, california',
'moraga, california', 'san bruno, california',
'petaluma, california', 'pinole, california',
'san geronimo, california', 'crockett, california',
'boulder, colorado', 'brisbane, california', 'freedom, california',
'montara, california', 'green brae, california',
'woodside, california', 'new york, new york', 'ross, california',
'east palo alto, california', 'san quentin, california',
'portland, oregon', 'rodeo, california',
'hacienda heights, california', 'woodacre, california',
'westlake, california', 'riverside, california',
'rohnert park, california', 'sacramento, california',
'point richmond, california', 'san diego, california',
'canyon country, california', 'tucson, arizona',
'honolulu, hawaii', 'billings, montana',
'west oakland, california', 'kentfield, california',
'milwaukee, wisconsin', 'woodbridge, virginia',
'glencove, california', 'tiburon, california', 'madrid, spain',
'las vegas, nevada', 'peoria, illinois',
'santa monica, california', 'bellwood, illinois',
'los angeles, california', 'moss beach, california',
'nha trang, vietnam', 'hillsborough, california',
'olema, california', 'union city, california', 'colma, california',
'cork, ireland', 'new orleans, louisiana',
'kensington, california', 'redwood shores, california',
'utica, michigan', 'brea, california', 'lagunitas, california',
'stinson beach, california', 'santa clara, california',
'studio city, california', 'concord, california',
'piedmont, california', 'grand rapids, michigan',
'seaside, california', 'leander, texas',
'forest knolls, california', 'edinburgh, united kingdom',
'magalia, california', 'london, united kingdom',
'astoria, new york', 'chicago, illinois', 'orange, california',
'south wellfleet, massachusetts', 'bayshore, california',
'asheville, north carolina', 'los gatos, california',
'boise, idaho', 'islip terrace, new york', 'sunnyvale, california',
'cambridge, massachusetts', 'lake orion, michigan',
'ozone park, new york', 'jackson, mississippi',
'ashland, california', 'south orange, new jersey',
'fort lauderdale, florida', 'minneapolis, minnesota',
'pasadena, california', 'atlanta, georgia', 'salt lake city, utah',
'arcadia, california', 'milpitas, california',
'san antonio, texas', 'port costa, california',
'nicasio, california', 'livingston, california',
'bellingham, washington', 'crowley, texas',
'boston, massachusetts', 'longwood, florida',
'fayetteville, west virginia', 'granite bay, california',
'isla vista, california', 'hilarita, california',
'campbell, california', 'stratford, connecticut',
'santa ana, california', 'santa rosa, california', 'kula, hawaii',
'murfreesboro, tennessee', 'brooklyn, new york',
'north hollywood, california', 'nevada city, california',
'providence, rhode island', 'stockton, california',
'marin city, california', 'washington, district of columbia',
'waterford, california', 'vancouver, british columbia, canada',
'muir beach, california', 'pacheco, california',
'irvine, california', 'kansas city, missouri', 'kassel, germany',
'canyon, california', 'philadelphia, pennsylvania',
'oceanview, california', 'long beach, new york',
'amsterdam, netherlands', 'taunton, massachusetts',
'napa, california', 'austin, texas', 'san luis obispo, california',
'modesto, california', 'bonaduz, switzerland',
'costa mesa, california', 'guadalajara, mexico',
'oakley, california', 'columbus, ohio', 'chico, california',
'south lake tahoe, california', 'vacaville, california',
'miami, florida', 'long beach, california', 'denver, colorado',
'seattle, washington', 'cincinnati, ohio', 'phoenix, arizona',
'rochester, michigan'], dtype=object)
# Keep only California users (the overwhelming majority of the dataset).
df_locations = df_data.loc[
    df_data["location"].str.contains(", california"), ["location"]
]
df_locations.to_csv("locations_cali.csv")
# BUG FIX: the original called .unique() on a DataFrame, which raises
# AttributeError — unique() exists only on Series. Deduplicate the Series
# first, then wrap in a DataFrame (column named 0, as downstream code expects).
df_loc_unique = pd.DataFrame(df_locations["location"].unique())
from opencage.geocoder import OpenCageGeocode
import os

# SECURITY FIX: never commit API keys to source control — the previously
# hard-coded OpenCage key should be revoked. Read the key from the
# environment instead (export OPENCAGE_API_KEY before running).
key = os.environ['OPENCAGE_API_KEY']
geocoder = OpenCageGeocode(key)
# Geocode each *unique* location once to minimise API calls; collect the
# latitude/longitude of the best (first) match for each place.
lat_list = []
long_list = []
for location in df_loc_unique[0]:
    results = geocoder.geocode(location)
    lat_list.append(results[0]['geometry']['lat'])
    long_list.append(results[0]['geometry']['lng'])
# Plain list assignment suffices — no need to round-trip through numpy.
df_loc_unique['lat'] = lat_list
df_loc_unique['long'] = long_list
# Attach each user's coordinates by looking up their location string.
# PERFORMANCE FIX: the original nested loop was O(users x unique_locations)
# and built lists of one-row Series; a dict-backed Series.map does the same
# lookup in O(users) and yields scalars directly.
coords = df_loc_unique.set_index(0)
df_locations["lat"] = df_locations["location"].map(coords["lat"])
df_locations["long"] = df_locations["location"].map(coords["long"])
df_locations.to_csv("users_locations_latlong.csv")

# Median age per California location (column 14 = location, column 0 = age),
# exported for the external map visualization.
df_age_loc = df_data.iloc[:, [14, 0]]
df_agemed_loc = df_age_loc.groupby("location").median()
cali = df_agemed_loc.index.str.contains(", california")
df_agemed_loc = df_agemed_loc[cali]
df_agemed_loc.to_csv("agemed_sf.csv")

# Median income per California location (column 14 = location, 11 = income),
# excluding users who did not disclose income (encoded as -1).
df_inc_loc = df_data.iloc[:, [14, 11]]
df_inc_loc = df_inc_loc[df_inc_loc.income > 0]
df_inc_loc = df_inc_loc[df_inc_loc["location"].str.contains(", california")]
# BUG FIX: the original grouped by df_age_loc["location"] — a different,
# unfiltered frame — and only worked through incidental index alignment.
# Group by this frame's own column instead.
df_inc_loc = df_inc_loc.groupby("location").median()
df_inc_loc.to_csv("income_loc_sf.csv")
![Median income by California location](income_loc_sf.png?raw=true)
# The three oldest users — reveals implausible joke ages (110, 109).
df_data.sort_values("age", ascending=False).head(3)
| age | status | sex | orientation | body_type | diet | drinks | drugs | education | ethnicity | ... | essay0 | essay1 | essay2 | essay3 | essay4 | essay5 | essay6 | essay7 | essay8 | essay9 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2512 | 110 | single | f | straight | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 25324 | 109 | available | m | straight | athletic | mostly other | NaN | never | working on masters program | NaN | ... | NaN | NaN | NaN | nothing | NaN | NaN | NaN | NaN | NaN | NaN |
| 25051 | 69 | single | m | straight | average | anything | rarely | never | graduated from college/university | other | ... | retired...born in the usa... i'm a straight ma... | loving, laughing, living, and learning. | most things! | smile | bible, and many more! former member of... smpt... | god first, second, love my neighbors as i love... | isaiah 53; and the 1st, 2nd...matthew 6:33 and... | there is know typical friday for me! | that's why its private! i might tell you when ... | if interested contact me.....i'm a student of,... |
3 rows × 31 columns
# Ages of 100+ are clearly joke entries (110, 109) — drop them before plotting.
df_data = df_data[df_data["age"] < 100]
sns.set_style("darkgrid")
sns.set_context("talk")
sns.set(rc={"figure.figsize": (14, 6)})
# Side-by-side (dodged) bars per sex, 4-year bins from 18 to 70.
ax = sns.histplot(
    df_data, x=df_data.age, bins=range(18, 71, 4), hue="sex", multiple="dodge"
)
ax.set_title("OkCupid Users Age")
plt.xlim([18, 70])
plt.xticks(range(18, 71, 4))
plt.xlabel("Age")
plt.show()
# Occupation counts (columns 12/11/2 = job, income, sex), most common first.
df_job_inc = df_data.iloc[:, [12, 11, 2]].dropna()
sns.set_style("darkgrid")
sns.set_context("talk")
sns.set(rc={"figure.figsize": (14, 6)})
job_order = df_job_inc.job.value_counts().index
ax = sns.countplot(data=df_job_inc, x=df_job_inc.job, palette="rocket", order=job_order)
ax.set_title("OkCupid Users Occupation")
plt.tick_params(axis="x", rotation=90)
plt.xlabel("Occupation")
plt.show()
# Drop users who did not disclose income (encoded as -1) before comparing pay.
df_job_inc = df_job_inc[df_job_inc.income > 0]
df_job_inc.job.unique()
array(['hospitality / travel', 'student',
'banking / financial / real estate', 'sales / marketing / biz dev',
'other', 'construction / craftsmanship',
'artistic / musical / writer', 'education / academia',
'law / legal services', 'entertainment / media',
'executive / management', 'medicine / health',
'computer / hardware / software', 'science / tech / engineering',
'transportation', 'retired', 'rather not say',
'political / government', 'unemployed', 'military',
'clerical / administrative'], dtype=object)
# Remove non-informative job categories before comparing incomes.
df_job_inc = df_job_inc.drop(df_job_inc[df_job_inc.job.isin(["student","other","retired","rather not say","unemployed"])].index)
# BUG FIX: sns.set_theme() resets ALL rc params, so the original's
# sns.set(rc={'figure.figsize': (44, 15)}) on the previous line was wiped
# and the figsize never applied. Pass style and rc in one call instead.
sns.set_theme(style="whitegrid", rc={'figure.figsize': (44, 15)})
# Hide outliers so the boxes stay readable at this income scale.
ax = sns.boxplot(x="job", y="income", data=df_job_inc, showfliers=False)
plt.ylim([10000, 251000])
plt.show()
# Encode drug use as a binary flag: 0 = never, 1 = sometimes/often.
df_data = df_data.replace({'drugs' : { 'never' : 0, 'often' : 1, 'sometimes' : 1 }})
df_data_drugs = df_data.dropna(subset=['drugs'])
# BUG FIX: the original indexed df_data_drugs with a boolean mask built from
# df_data, triggering "Boolean Series key will be reindexed" — and the
# `!= 2` test was a no-op anyway, since the column only holds 0/1 here.
df_sum_drugs = df_data_drugs.groupby("age")["drugs"].sum()
df_age_drugs = df_data_drugs.value_counts("age").sort_index()
df_drugs_con_by_age = pd.concat([df_sum_drugs, df_age_drugs], axis=1)
# The count column's name differs across pandas versions (0 vs 'count'),
# so select both columns positionally: rate = users-on-drugs / all users.
rate = df_drugs_con_by_age.iloc[:, 0] / df_drugs_con_by_age.iloc[:, 1]
plt.plot(df_drugs_con_by_age.index, rate, color='red', marker='o')
plt.title('Drugs Consumption Rate Vs Age', fontsize=14)
plt.xlabel('Age', fontsize=14)
plt.ylabel('Drugs Consumption Rate', fontsize=14)
plt.grid(True)
plt.show()
# Split the profiles by sex for the two word clouds below.
df_data_men = df_data[df_data["sex"] == "m"]
df_data_women = df_data[df_data["sex"] == "f"]
# Combine WordCloud's defaults with an extended English stop-word list so
# filler words do not dominate the essay word clouds.
# FIX: the word "please" was split across two lines by a stray newline inside
# the string literal (an unterminated-string syntax error); rejoined here.
stopwords = set(STOPWORDS)
stopwords.update(["a","about","above","after","again","against","ain","all","am","an","and","any","are","aren","aren't","as","at","be","because","been","before","being","below","between","both","but","by","can","couldn","couldn't","d","did","didn","didn't","do","does","doesn","doesn't","doing","don","don't","down","during","each","few","for","from","further","had","hadn","hadn't","has","hasn","hasn't","have","haven","haven't","having","he","her","here","hers","herself","him","himself","his","how","i","if","in","into","is","isn","isn't","it","it's","its","itself","just","ll","m","ma","me","mightn","mightn't","more","most","mustn","mustn't","my","myself","needn","needn't","no","nor","not","now","o","of","off","on","once","only","or","other","our","ours","ourselves","out","over","own","re","s","same","shan","shan't","she","she's","should","should've","shouldn","shouldn't","so","some","such","t","than","that","that'll","the","their","theirs","them","themselves","then","there","these","they","this","those","through","to","too","under","until","up","ve","very","was","wasn","wasn't","we","were","weren","weren't","what","when","where","which","while","who","whom","why","will","with","won","won't","wouldn","wouldn't","y","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","could","he'd","he'll","he's","here's","how's","i'd","i'll","i'm","i've","let's","ought","she'd","she'll","that's","there's","they'd","they'll","they're","they've","we'd","we'll","we're","we've","what's","when's","where's","who's","why's","would","able","abst","accordance","according","accordingly","across","act","actually","added","adj","affected","affecting","affects","afterwards","ah","almost","alone","along","already","also","although","always","among","amongst","announce","another","anybody","anyhow","anymore","anyone","anything","anyway","anyways","anywhere","apparently","approximately","arent","arise","around","aside","ask","asking","auth","available","away","awfully","b",
"back","became","become","becomes","becoming","beforehand","begin","beginning","beginnings","begins","behind","believe","beside","besides","beyond","biol","brief","briefly","c","ca","came","cannot","can't","cause","causes","certain","certainly","co","com","come","comes","contain","containing","contains","couldnt","date","different","done","downwards","due","e","ed","edu","effect","eg","eight","eighty","either","else","elsewhere","end","ending","enough","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","except","f","far","ff","fifth","first","five","fix","followed","following","follows","former","formerly","forth","found","four","furthermore","g","gave","get","gets","getting","give","given","gives","giving","go","goes","gone","got","gotten","h","happens","hardly","hed","hence","hereafter","hereby","herein","heres","hereupon","hes","hi","hid","hither","home","howbeit","however","hundred","id","ie","im","immediate","immediately","importance","important","inc","indeed","index","information","instead","invention","inward","itd","it'll","j","k","keep","keeps","kept","kg","km","know","known","knows","l","largely","last","lately","later","latter","latterly","least","less","lest","let","lets","like","liked","likely","line","little","'ll","look","looking","looks","ltd","made","mainly","make","makes","many","may","maybe","mean","means","meantime","meanwhile","merely","mg","might","million","miss","ml","moreover","mostly","mr","mrs","much","mug","must","n","na","name","namely","nay","nd","near","nearly","necessarily","necessary","need","needs","neither","never","nevertheless","new","next","nine","ninety","nobody","non","none","nonetheless","noone","normally","nos","noted","nothing","nowhere","obtain","obtained","obviously","often","oh","ok","okay","old","omitted","one","ones","onto","ord","others","otherwise","outside","overall","owing","p","page","pages","part","particular","particularly","past","per","perhaps","placed","please","plus","poorly","possible","possibly","potentially","pp","predominantly","present","previously","primarily","probably","promptly","proud","provides","put","q","que","quickly","quite","qv","r","ran","rather","rd","readily","really","recent","recently","ref","refs","regarding","regardless","regards","related","relatively","research","respectively","resulted","resulting","results","right","run","said","saw","say","saying","says","sec","section","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sent","seven","several","shall","shed","shes","show","showed","shown","showns","shows","significant","significantly","similar","similarly","since","six","slightly","somebody","somehow","someone","somethan","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specifically","specified","specify","specifying","still","stop","strongly","sub","substantially","successfully","sufficiently","suggest","sup","sure","take","taken","taking","tell","tends","th","thank","thanks","thanx","thats","that've","thence","thereafter","thereby","thered","therefore","therein","there'll","thereof","therere","theres","thereto","thereupon","there've","theyd","theyre","think","thou","though","thoughh","thousand","throug","throughout","thru","thus","til","tip","took","toward","towards","tried","tries","truly","try","trying","ts","twice","two","u","un","unfortunately","unless","unlike","unlikely","unto","upon","ups","us","use","used","useful","usefully","usefulness","uses","using","usually","v","value","various","'ve","via","viz","vol","vols","vs","w","want","wants","wasnt","way","wed","welcome","went","werent","whatever","what'll","whats","whence","whenever","whereafter","whereas","whereby","wherein","wheres","whereupon","wherever","whether","whim","whither","whod","whoever","whole","who'll","whomever","whos","whose","widely","willing","wish","within","without","wont","words","world","wouldnt","www","x","yes","yet","youd","youre","z","zero","a's","ain't","allow",
"allows","apart","appear","appreciate","appropriate","associated","best","better","c'mon","c's","cant","changes","clearly","concerning","consequently","consider","considering","corresponding","course","currently","definitely","described","despite","entirely","exactly","example","going","greetings","hello","help","hopefully","ignored","inasmuch","indicate","indicated","indicates","inner","insofar","it'd","keep","keeps","novel","presumably","reasonably","second","secondly","seriously","sure","t's","third","thorough","thoroughly","three","well","wonder"])
# Word cloud of men's self-summaries (essay0), drawn inside a male silhouette.
mask = np.array(Image.open("./man_sil.jpeg"))
# NaN essays become the string "nan"; joined into one corpus for the cloud.
text = " ".join(df_data_men.essay0.astype(str))
wc = WordCloud(
    background_color="white",
    mask=mask,
    stopwords=stopwords,
    min_word_length=4,
    scale=3,
).generate(text)
plt.figure(figsize=[20, 20])
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
# Word cloud of women's self-summaries (essay0), inside a female silhouette.
mask2 = np.array(Image.open("./woman_sil.jpeg"))
# NaN essays become the string "nan"; joined into one corpus for the cloud.
text2 = " ".join(df_data_women.essay0.astype(str))
wc2 = WordCloud(
    background_color="white",
    mask=mask2,
    stopwords=stopwords,
    min_word_length=4,
    scale=3,
).generate(text2)
plt.figure(figsize=[20, 20])
plt.imshow(wc2, interpolation="bilinear")
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()